In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats
import statsmodels.api as sm

Income and Wealth In The US

Wealth Distribution

In [2]:
# load US wealth and income distribution data
# (removed a stray `perc_wealth.tail()` in the middle of the cell: only a cell's
#  last expression is rendered, and the last statement here is an assignment,
#  so the call computed a frame and silently discarded it)

perc_wealth = pd.read_csv('data//percent_of_total_wealth.csv', index_col = 0)
# Top 10% share = Top 1% plus the 90th-99th percentile band
perc_wealth['Top 10%'] = perc_wealth['Top 1%'] + perc_wealth['90th-99th']

gini_data = pd.read_csv('data//household_gini.csv')
In [3]:
# peek at the first rows of the wealth-share frame (quarterly, indexed by DATE)
perc_wealth.head()
Out[3]:
Top 1% Bottom 50% 50th - 90th 90th-99th Top 10%
DATE
7/1/1989 23.6 3.7 35.5 37.2 60.8
10/1/1989 23.6 3.5 35.6 37.3 60.9
1/1/1990 23.5 3.7 35.8 37.0 60.5
4/1/1990 23.7 3.5 35.8 37.0 60.7
7/1/1990 23.2 3.7 36.1 37.0 60.2
In [4]:
# first rows of the annual household income Gini series (starts 1947)
gini_data.head()
Out[4]:
DATE GINI
0 1/1/1947 0.376
1 1/1/1948 0.371
2 1/1/1949 0.378
3 1/1/1950 0.379
4 1/1/1951 0.363
In [5]:
# interactive line chart of wealth shares by percentile group over time
share_labels = {'DATE': 'Date',
                'value': 'Percent of Total Net Worth',
                'variable': 'Wealth Percentile'}
fig = px.line(
    perc_wealth,
    x=perc_wealth.index,
    y=['Top 10%', '50th - 90th', 'Bottom 50%'],
    title='Shares of Total Net Worth Over Time',
    labels=share_labels,
)
fig.update_layout(hovermode="x")

fig.show()
In [6]:
# stacked bar chart form of above

fig2 = go.Figure(data=[
    go.Bar(name='Top 10%', x=perc_wealth.index, y=perc_wealth['Top 10%']),
    go.Bar(name='50th - 90th Percentile', x=perc_wealth.index, y=perc_wealth['50th - 90th']),
    go.Bar(name='Bottom 50%', x=perc_wealth.index, y=perc_wealth['Bottom 50%'])
])

# NOTE: `titlefont` is deprecated in plotly; the nested title=dict(text=..., font=...)
# form is the supported equivalent
fig2.update_layout(yaxis=dict(
        title=dict(text="% of Total Net Worth", font=dict(size=15)),
        ticktext=["0%", "20%", "40%", "60%","80%","100%"],
        tickvals=[0, 20, 40, 60, 80, 100],
        tickmode="array",
    ),
    title = 'Shares of Total Net Worth Over Time',
    barmode='stack')
fig2.show()

The top 10% of wealth holders have consistently increased their share of total wealth, from 60.8% in Q3 1989 to 69.6% in Q3 2021.

Interestingly, while the middle 40% lost their share of wealth, the bottom 50% saw a steep increase in their share beginning in Q1 2021.

In [7]:
# zoom in on the Bottom 50% share on its own axis
fig_bot = px.line(
    perc_wealth,
    x=perc_wealth.index,
    y='Bottom 50%',
    title='Share of Wealth held by Bottom 50%',
    labels={'DATE': 'Date', 'Bottom 50%': 'Percent of Total Net Worth'},
)
fig_bot.update_layout(hovermode="x")

fig_bot.show()

Income Distribution

In [8]:
# household income Gini ratio over time
fig3 = px.line(
    gini_data,
    x='DATE',
    y='GINI',
    title='Household Income Gini Ratio',
    labels={'DATE': 'Date', 'GINI': 'Gini Ratio'},
)
fig3.update_layout(hovermode="x")

fig3.show()

Since 1968, inequality in income has continued to rise.

How Inequality Relates to the Economy and Quality of Life

In [9]:
# load OECD and UN economic, QOL data

productivity = pd.read_csv('data//GDP_per_labor_hr.csv')  # GDP per labor hour, by country/year
oecd_gini = pd.read_csv('data//oecd_gini.csv')  # income Gini by country/year
oecd_tax_rev = pd.read_csv('data//oecd_tax_rev_perGDP.csv')  # tax revenue as % of GDP
happiness = pd.read_csv('data//whr_data.csv')  # World Happiness Report panel
hdi = pd.read_csv('data//hdi.csv', index_col =0)  # Human Development Index (2018 values)
In [10]:
# merge OECD data into one dataframe: Gini + tax revenue + productivity,
# keyed on country (LOCATION) and year (TIME). The first merge suffixes the two
# 'Value' columns as Value_x/Value_y; productivity's 'Value' then joins unsuffixed.
oecd_combined_data = (
    oecd_gini
    .merge(oecd_tax_rev, how='left', on=['LOCATION', 'TIME'])
    .merge(productivity, how='left', on=['LOCATION', 'TIME'])
    [['LOCATION', 'TIME', 'Value_x', 'Value_y', 'Value']]
    .rename(columns={'Value_x': 'Gini',
                     'Value_y': 'Tax Revenue (% of GDP)',
                     'Value': 'Productivity (GDP/Labor Hr)'})
)
In [11]:
# sanity-check the merged OECD frame (one row per country-year)
oecd_combined_data.head()
Out[11]:
LOCATION TIME Gini Tax Revenue (% of GDP) Productivity (GDP/Labor Hr)
0 AUS 2016 0.330 27.464 99.728309
1 AUS 2018 0.325 28.534 100.811475
2 AUT 2016 0.284 41.753 99.877092
3 AUT 2017 0.275 41.873 101.087913
4 AUT 2018 0.280 42.251 101.624302
In [12]:
# prepare World Happiness Report data: keep country/year/score, restrict to 2018
happiness = (
    happiness[['Country name', 'year', 'Life Ladder']]
    .rename(columns={'Life Ladder': 'Happiness Score'})
)
happiness = happiness[happiness['year'] == 2018]
happiness.head()
Out[12]:
Country name year Happiness Score
0 Afghanistan 2018 2.694
1 Albania 2018 5.004
2 Algeria 2018 5.043
3 Argentina 2018 5.793
4 Armenia 2018 5.062
In [13]:
# HDI data: one row per country with its 2018 index value
hdi.head()
Out[13]:
Country HDI (2018)
0 Norway 0.956
1 Ireland 0.951
2 Switzerland 0.955
3 Hong Kong 0.946
4 Iceland 0.946
In [14]:
# combine HDI and WHR data from UN; the inner join keeps only countries
# present in both tables, and the redundant 'Country' key column is dropped
un_combined_data = (
    happiness
    .merge(hdi, how='inner', left_on='Country name', right_on='Country')
    .drop(columns=['Country'])
)

un_combined_data.head()
Out[14]:
Country name year Happiness Score HDI (2018)
0 Afghanistan 2018 2.694 0.509
1 Albania 2018 5.004 0.792
2 Algeria 2018 5.043 0.746
3 Argentina 2018 5.793 0.842
4 Armenia 2018 5.062 0.771
In [15]:
# select only OECD from 2018 since hdi data is only complete for 2018
oecd_combined_data = oecd_combined_data[oecd_combined_data['TIME']==2018]

# convert OECD abbreviations into UN-format country names
# NOTE: the original dict listed 'SVN':'Slovenia' twice; Python dict literals
# silently keep only the last duplicate key, so the redundant entry is removed.
country_abbr = {'AUS':'Australia','AUT':'Austria','BEL':'Belgium','CAN':'Canada','CZE':'Czech Republic',
                'DNK':'Denmark','FRA':'France','DEU':'Germany','GRC':'Greece','HUN':'Hungary','LUX':'Luxembourg',
                'IRL':'Ireland','ITA':'Italy','NLD':'Netherlands','NOR':'Norway','PRT':'Portugal','SVK':'Slovakia','ESP':'Spain',
                'SWE':'Sweden','GBR':'United Kingdom','USA':'United States','EST':'Estonia','SVN':'Slovenia',
                'JPN':'Japan','KOR':'South Korea','MEX':'Mexico','LVA':'Latvia','LTU':'Lithuania','CRI':'Costa Rica',
                'CHE':'Switzerland','TUR':'Turkey','ISR':'Israel','POL':'Poland','FIN':'Finland'}
# .replace scans every cell, but only LOCATION contains these codes (see the
# table below), so this effectively renames the country column in place
oecd_combined_data = oecd_combined_data.replace(country_abbr)
In [16]:
# full 2018 OECD table with readable country names (bare expression renders as output)
oecd_combined_data
Out[16]:
LOCATION TIME Gini Tax Revenue (% of GDP) Productivity (GDP/Labor Hr)
1 Australia 2018 0.325 28.534 100.811475
4 Austria 2018 0.280 42.251 101.624302
6 Belgium 2018 0.258 43.869 100.133328
10 Canada 2018 0.303 33.506 102.274591
14 Czech Republic 2018 0.249 34.982 104.474820
18 Denmark 2018 0.263 44.175 105.350786
21 Finland 2018 0.269 42.386 103.725411
24 France 2018 0.301 45.878 102.814627
28 Germany 2018 0.289 38.430 103.203085
31 Greece 2018 0.306 40.000 93.617747
35 Hungary 2018 0.280 36.820 104.792472
41 Ireland 2018 0.292 22.354 109.265966
44 Italy 2018 0.330 41.725 100.323292
45 Japan 2018 0.334 31.552 101.549420
48 South Korea 2018 0.345 26.686 111.478289
51 Luxembourg 2018 0.318 39.468 98.822155
54 Mexico 2018 0.418 16.145 100.453035
57 Netherlands 2018 0.295 38.799 100.064801
61 Norway 2018 0.262 39.368 102.017169
65 Poland 2018 0.281 35.143 115.685002
68 Portugal 2018 0.317 34.656 100.781705
72 Slovakia 2018 0.236 34.198 105.077678
76 Spain 2018 0.330 34.662 101.099140
80 Sweden 2018 0.275 43.774 100.657132
84 Switzerland 2018 0.311 26.811 104.878716
86 Turkey 2018 0.397 23.983 109.870924
89 United Kingdom 2018 0.366 32.887 101.647994
93 United States 2018 0.393 24.894 102.326631
98 Estonia 2018 0.305 33.046 111.814508
102 Israel 2018 0.348 30.802 105.355190
105 Slovenia 2018 0.249 37.271 110.187721
109 Latvia 2018 0.351 31.144 108.900284
113 Lithuania 2018 0.361 30.227 108.600526
117 Costa Rica 2018 0.479 23.188 108.227453
In [17]:
# combine oecd and un dataframes into one; dropna removes OECD countries
# with no matching UN row after the left join
final_dataset = (
    oecd_combined_data
    .merge(un_combined_data, how='left', left_on='LOCATION', right_on='Country name')
    .drop(columns=['Country name', 'year'])
    .dropna()
)
In [18]:
# final merged dataset used for the regression analysis below
final_dataset
Out[18]:
LOCATION TIME Gini Tax Revenue (% of GDP) Productivity (GDP/Labor Hr) Happiness Score HDI (2018)
0 Australia 2018 0.325 28.534 100.811475 7.177 0.943
1 Austria 2018 0.280 42.251 101.624302 7.396 0.921
2 Belgium 2018 0.258 43.869 100.133328 6.892 0.930
3 Canada 2018 0.303 33.506 102.274591 7.175 0.928
4 Czech Republic 2018 0.249 34.982 104.474820 7.034 0.898
5 Denmark 2018 0.263 44.175 105.350786 7.649 0.939
6 Finland 2018 0.269 42.386 103.725411 7.858 0.937
7 France 2018 0.301 45.878 102.814627 6.666 0.898
8 Germany 2018 0.289 38.430 103.203085 7.118 0.946
9 Greece 2018 0.306 40.000 93.617747 5.409 0.881
10 Hungary 2018 0.280 36.820 104.792472 5.936 0.850
11 Ireland 2018 0.292 22.354 109.265966 6.962 0.951
12 Italy 2018 0.330 41.725 100.323292 6.517 0.890
13 Japan 2018 0.334 31.552 101.549420 5.794 0.917
14 South Korea 2018 0.345 26.686 111.478289 5.840 0.913
15 Luxembourg 2018 0.318 39.468 98.822155 7.243 0.914
16 Mexico 2018 0.418 16.145 100.453035 6.550 0.776
17 Netherlands 2018 0.295 38.799 100.064801 7.463 0.942
18 Norway 2018 0.262 39.368 102.017169 7.444 0.956
19 Poland 2018 0.281 35.143 115.685002 6.111 0.877
20 Portugal 2018 0.317 34.656 100.781705 5.920 0.860
21 Slovakia 2018 0.236 34.198 105.077678 6.235 0.858
22 Spain 2018 0.330 34.662 101.099140 6.513 0.905
23 Sweden 2018 0.275 43.774 100.657132 7.375 0.943
24 Switzerland 2018 0.311 26.811 104.878716 7.509 0.955
25 Turkey 2018 0.397 23.983 109.870924 5.186 0.817
26 United Kingdom 2018 0.366 32.887 101.647994 7.233 0.928
27 United States 2018 0.393 24.894 102.326631 6.883 0.925
28 Estonia 2018 0.305 33.046 111.814508 6.091 0.889
29 Israel 2018 0.348 30.802 105.355190 6.927 0.916
30 Slovenia 2018 0.249 37.271 110.187721 6.249 0.912
31 Latvia 2018 0.351 31.144 108.900284 5.901 0.863
32 Lithuania 2018 0.361 30.227 108.600526 6.309 0.876
33 Costa Rica 2018 0.479 23.188 108.227453 7.141 0.808
In [19]:
# create scatter plot matrix to examine associations between predictors
spm_dimensions = ['Gini', 'Tax Revenue (% of GDP)',
                  'Productivity (GDP/Labor Hr)', 'Happiness Score', 'HDI (2018)']
fig4 = px.scatter_matrix(final_dataset, dimensions=spm_dimensions,
                         height=1500, width=1500)
fig4.update_traces(diagonal_visible=False)
fig4.show()

Gini appears to have the strongest linear associations with tax revenue and HDI.

In [20]:
# plot Tax Revenue and HDI specifically
# (removed the redundant re-imports of make_subplots and plotly.graph_objects:
#  both are already imported in the notebook's top import cell)

fig5 = make_subplots(rows=1, cols=2)

# left panel: Gini vs tax revenue; hover text shows the country name
fig5.add_trace(
    go.Scatter(y=final_dataset['Gini'], x=final_dataset['Tax Revenue (% of GDP)'], text = final_dataset['LOCATION'],
               mode="markers"),
    row=1, col=1
)

# right panel: Gini vs HDI
fig5.add_trace(
    go.Scatter(y=final_dataset['Gini'], x=final_dataset['HDI (2018)'], text = final_dataset['LOCATION'], mode="markers"),
    row=1, col=2
)

fig5.update_xaxes(title_text="Tax Revenue (% of GDP)", row=1, col=1)
fig5.update_xaxes(title_text="HDI (2018)", row=1, col=2)

fig5.update_yaxes(title_text="Gini Ratio", row=1, col=1)
fig5.update_yaxes(title_text="Gini Ratio", row=1, col=2)

fig5.update_layout(height=600, width=1000, showlegend=False)
fig5.show()

Backward Selection for Linear Model

We look for a model where all predictors have p-value below the threshold $\alpha = 0.05$

In [21]:
# initialize and fit model containing all predictors, then remove predictor with
# highest p-value above alpha = .05 to create next model

y = final_dataset['Gini']
all_predictors = ['Tax Revenue (% of GDP)', 'Productivity (GDP/Labor Hr)',
                  'Happiness Score', 'HDI (2018)']
# add_constant prepends the intercept column expected by sm.OLS
X1 = sm.add_constant(final_dataset[all_predictors])
full_model = sm.OLS(y, X1)
results = full_model.fit()
In [22]:
# full-model fit: Happiness Score carries the highest p-value (0.345 > .05),
# so it is the first predictor dropped in the backward selection below
print(results.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   Gini   R-squared:                       0.602
Model:                            OLS   Adj. R-squared:                  0.547
Method:                 Least Squares   F-statistic:                     10.96
Date:                Mon, 14 Feb 2022   Prob (F-statistic):           1.55e-05
Time:                        10:16:18   Log-Likelihood:                 67.842
No. Observations:                  34   AIC:                            -125.7
Df Residuals:                      29   BIC:                            -118.1
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
===============================================================================================
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const                           1.0455      0.215      4.863      0.000       0.606       1.485
Tax Revenue (% of GDP)         -0.0046      0.001     -4.590      0.000      -0.007      -0.003
Productivity (GDP/Labor Hr)    -0.0022      0.001     -1.448      0.158      -0.005       0.001
Happiness Score                 0.0111      0.012      0.959      0.345      -0.013       0.035
HDI (2018)                     -0.4694      0.189     -2.480      0.019      -0.856      -0.082
==============================================================================
Omnibus:                        3.889   Durbin-Watson:                   1.534
Prob(Omnibus):                  0.143   Jarque-Bera (JB):                2.471
Skew:                          -0.554   Prob(JB):                        0.291
Kurtosis:                       3.718   Cond. No.                     4.47e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.47e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
In [23]:
# second model: Happiness Score removed (p-value 0.345 exceeded alpha = .05)
X2 = sm.add_constant(
    final_dataset[['Tax Revenue (% of GDP)', 'Productivity (GDP/Labor Hr)', 'HDI (2018)']]
)
model1 = sm.OLS(y, X2)
results1 = model1.fit()
print(results1.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   Gini   R-squared:                       0.589
Model:                            OLS   Adj. R-squared:                  0.548
Method:                 Least Squares   F-statistic:                     14.34
Date:                Mon, 14 Feb 2022   Prob (F-statistic):           5.63e-06
Time:                        10:16:18   Log-Likelihood:                 67.311
No. Observations:                  34   AIC:                            -126.6
Df Residuals:                      30   BIC:                            -120.5
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
===============================================================================================
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const                           1.0596      0.214      4.947      0.000       0.622       1.497
Tax Revenue (% of GDP)         -0.0046      0.001     -4.654      0.000      -0.007      -0.003
Productivity (GDP/Labor Hr)    -0.0024      0.001     -1.671      0.105      -0.005       0.001
HDI (2018)                     -0.3686      0.157     -2.345      0.026      -0.690      -0.048
==============================================================================
Omnibus:                        3.369   Durbin-Watson:                   1.485
Prob(Omnibus):                  0.186   Jarque-Bera (JB):                2.235
Skew:                          -0.276   Prob(JB):                        0.327
Kurtosis:                       4.129   Cond. No.                     4.38e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.38e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
In [24]:
# final model: Productivity removed (p-value 0.105 > .05); the two remaining
# predictors are both significant at alpha = .05
X3 = sm.add_constant(final_dataset[['Tax Revenue (% of GDP)', 'HDI (2018)']])
model2 = sm.OLS(y, X3)
results2 = model2.fit()
print(results2.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   Gini   R-squared:                       0.551
Model:                            OLS   Adj. R-squared:                  0.522
Method:                 Least Squares   F-statistic:                     19.02
Date:                Mon, 14 Feb 2022   Prob (F-statistic):           4.08e-06
Time:                        10:16:18   Log-Likelihood:                 65.799
No. Observations:                  34   AIC:                            -125.6
Df Residuals:                      31   BIC:                            -121.0
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==========================================================================================
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                      0.7763      0.135      5.769      0.000       0.502       1.051
Tax Revenue (% of GDP)    -0.0041      0.001     -4.234      0.000      -0.006      -0.002
HDI (2018)                -0.3541      0.161     -2.193      0.036      -0.683      -0.025
==============================================================================
Omnibus:                        2.798   Durbin-Watson:                   1.721
Prob(Omnibus):                  0.247   Jarque-Bera (JB):                1.555
Skew:                          -0.358   Prob(JB):                        0.460
Kurtosis:                       3.764   Cond. No.                     1.16e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.16e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

For the model: $$\widehat{\text{Gini}} = 0.7763 - 0.0041 \cdot \text{Tax Revenue (\% of GDP)} - 0.3541 \cdot \text{HDI}$$
the coefficients have P-values below the threshold $\alpha = 0.05$, so we reject the null hypothesis that the coefficients are equal to zero. In addition, the F-test statistic is in the rejection region, so there is evidence that at least one of the predictors affects $\text{Gini}$.

We also have $R_\text{adj}^2 = .522$, so approximately 52% of the variation in $\text{Gini}$ can be accounted for by the explanatory variables.

In [25]:
# regression diagnostic plots for the Tax Revenue predictor
# (plot_regress_exog draws into and returns the figure passed via fig=)
fig6 = sm.graphics.plot_regress_exog(results2, 'Tax Revenue (% of GDP)',
                                     fig=plt.figure(figsize=(9,6)))
In [26]:
# regression diagnostic plots for the HDI predictor
fig7 = sm.graphics.plot_regress_exog(results2, 'HDI (2018)',
                                     fig=plt.figure(figsize=(9,6)))
In [27]:
# histogram of model residuals; the bare expression renders as the cell's output
px.histogram(x=results2.resid, labels = {'x':'Residuals'}, title = 'Histogram of Model Residuals')
In [28]:
# Q-Q plot of residuals against a fitted normal ('s' draws the standardized line)
npp = sm.ProbPlot(results2.resid)
fig8 = npp.qqplot(line='s')
fig8.gca().set_title('Normal Probability Plot')
plt.show()
In [29]:
# Shapiro-Wilk test for normality of the model residuals; displayed as the
# cell's last expression. (The local `from scipy import stats` was moved to the
# notebook's top import cell — imports belong in one place for clean re-runs.)
stats.shapiro(results2.resid)
Out[29]:
(0.9692316055297852, 0.44060808420181274)

$p = 0.44 > .05$ so we fail to reject null hypothesis. There is not enough evidence to suggest that the residuals are not normally distributed.

Neither of the residual plots show any clear pattern or clear signs of heteroscedasticity. In addition, the residuals are normally distributed so the linear model is appropriate.